Important: this dataset has ONLY been filtered for "red" criteria for technical quality.
# Build the modeling dataset from the screened AOC toxicity data (aoc_z):
# keep studies passing the technical-quality and risk-assessment screens,
# harmonize effect-metric labels, log10-transform strictly positive numeric
# columns, and recode the ordinal effect flag (Y/N) as a 0/1 factor response.
df <- aoc_z %>%
  filter(tech.tier.zero == "N") %>% # studies that pass technical quality criteria
  filter(risk.tier.zero == "N") %>% # only studies applicable for risk assessment. VERY RESTRICTIVE
  dplyr::select(c(organism.group, exposure.duration.d, lvl1_f,
                  dose.um3.mL.master, dose.mg.L.master, dose.particles.mL.master,
                  life.stage, bio.org, polymer, shape,
                  size.length.um.used.for.conversions, treatments, effect,
                  effect.metric, particle.volume.um3, density.mg.um.3,
                  environment, exposure.route, lvl2_f)) %>%
  mutate(effect.metric = as.character(effect.metric)) %>%
  # Harmonize effect-metric labels: NONEC/HONEC are synonyms of NOEC.
  # Fix: the original case_when had no "NOEC" arm, so a literal "NOEC" in the
  # raw data would have been silently recoded to NA -> "not_available".
  # Unlisted metrics still fall through to NA as before.
  mutate(effect.metric = (case_when(
    effect.metric == "NONEC" ~ "NOEC",
    effect.metric == "HONEC" ~ "NOEC",
    effect.metric == "NOEC" ~ "NOEC",
    effect.metric == "LOEC" ~ "LOEC",
    effect.metric == "LC50" ~ "LC50",
    effect.metric == "EC50" ~ "EC50",
    effect.metric == "EC10" ~ "EC10"
  ))) %>%
  mutate(effect.metric = replace_na(effect.metric, "not_available")) %>%
  mutate(effect.metric = as.factor(effect.metric)) %>%
  drop_na() %>% # drop rows with missing values (before the numeric transform)
  # log10-transform numeric columns whose values are all strictly positive.
  # Fix: the original predicate `is.numeric(.) && (.) > 0` applied && to a
  # whole column, which tests only the first element (and errors under
  # R >= 4.3); all(. > 0) checks the entire column as intended.
  mutate_if(~ is.numeric(.) && all(. > 0), log10) %>%
  mutate(effect_10 = case_when( # convert ordinal effect (Y/N) to numeric 1/0
    effect == "Y" ~ 1,
    effect == "N" ~ 0
  )) %>%
  mutate(effect_10 = factor(effect_10)) %>%
  dplyr::select(-(effect))
# ensure completeness
skim(df)
| Name | df |
| Number of rows | 2188 |
| Number of columns | 19 |
| _______________________ | |
| Column type frequency: | |
| factor | 11 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| organism.group | 0 | 1 | FALSE | 10 | Cru: 795, Fis: 528, Mol: 433, Alg: 222 |
| lvl1_f | 0 | 1 | FALSE | 7 | Fit: 1522, Met: 485, Beh: 95, Str: 44 |
| life.stage | 0 | 1 | FALSE | 4 | Ear: 974, Adu: 759, Not: 253, Juv: 202 |
| bio.org | 0 | 1 | FALSE | 5 | org: 1163, sub: 668, cel: 251, tis: 59 |
| polymer | 0 | 1 | FALSE | 7 | PS: 1262, PE: 588, PET: 129, PP: 106 |
| shape | 0 | 1 | FALSE | 3 | sph: 1463, fra: 649, fib: 76, Not: 0 |
| effect.metric | 0 | 1 | FALSE | 3 | not: 1576, NOE: 318, LOE: 294, EC1: 0 |
| environment | 0 | 1 | FALSE | 3 | Mar: 1384, Fre: 770, Ter: 34 |
| exposure.route | 0 | 1 | FALSE | 3 | wat: 2145, med: 34, sed: 9, cop: 0 |
| lvl2_f | 0 | 1 | FALSE | 25 | Mor: 412, Rep: 368, Oxi: 288, Dev: 287 |
| effect_10 | 0 | 1 | FALSE | 2 | 0: 1504, 1: 684 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| exposure.duration.d | 0 | 1 | 0.93 | 0.73 | -1.40 | 0.48 | 1.00 | 1.32 | 1.98 | ▁▁▆▆▇ |
| dose.um3.mL.master | 0 | 1 | 5.84 | 2.34 | -5.67 | 4.42 | 6.02 | 7.37 | 14.32 | ▁▁▇▆▁ |
| dose.mg.L.master | 0 | 1 | -0.23 | 2.20 | -11.64 | -1.70 | -0.30 | 1.40 | 8.17 | ▁▁▇▆▁ |
| dose.particles.mL.master | 0 | 1 | 4.24 | 4.05 | -4.20 | 1.05 | 3.40 | 7.20 | 21.36 | ▃▇▃▁▁ |
| size.length.um.used.for.conversions | 0 | 1 | 0.71 | 1.32 | -4.00 | -0.30 | 0.78 | 1.45 | 3.70 | ▁▃▅▇▃ |
| treatments | 0 | 1 | 0.58 | 0.15 | 0.00 | 0.48 | 0.60 | 0.60 | 1.00 | ▁▁▆▇▁ |
| particle.volume.um3 | 0 | 1 | 1.60 | 3.76 | -12.28 | -1.18 | 2.05 | 3.73 | 10.21 | ▁▃▆▇▃ |
| density.mg.um.3 | 0 | 1 | -8.98 | 0.04 | -9.06 | -9.00 | -8.97 | -8.97 | -8.85 | ▃▂▇▁▁ |
# Create train/test splits with the tidymodels/rsample initial_split() function.
# set.seed() makes the random 75/25 partition reproducible.
set.seed(4)
df_split <- df %>%
initial_split(prop = 0.75)
# prop = 0.75 -> 75% training, 25% testing (the rsample default is 3/4).
# training()/testing() pull the two partitions out of the split object.
train <- training(df_split)
test <- testing(df_split)
# variable names for response & features
y <- "effect_10"
x <- setdiff(names(df), y)
# Residual histograms, two models per plot.
# NOTE(review): mp_classif_* look like DALEX model_performance() objects
# created in a part of the file not shown here -- confirm.
plot(mp_classif_rf, mp_classif_glm, geom ="histogram")
plot(mp_classif_svm, mp_classif_decTree, geom ="histogram")
plot(mp_classif_nnet, mp_classif_xgbTree, geom ="histogram")
### Precision Recall Curves
# Precision-recall curves for all six models overlaid on one plot.
plot(mp_classif_rf, mp_classif_glm, mp_classif_svm, mp_classif_decTree, mp_classif_nnet, mp_classif_xgbTree, geom ="prc")
# compare residual distribution plots (default geom) across models
resid_dist <- plot(mp_classif_rf, mp_classif_glm, mp_classif_svm, mp_classif_decTree, mp_classif_nnet,
mp_classif_xgbTree) + #, mp_classif_logicBag) +
theme_minimal() +
theme(legend.position = 'bottom',
plot.title = element_text(hjust = 0.5)) +
labs(y = '')
resid_dist
#### Residuals
# Same residuals shown as boxplots, one box per model.
resid_box <- plot(mp_classif_rf, mp_classif_glm, mp_classif_svm, mp_classif_decTree, mp_classif_nnet, mp_classif_xgbTree,
#mp_classif_logicBag,
geom = "boxplot") +
theme_minimal() +
theme(legend.position = 'bottom',
plot.title = element_text(hjust = 0.5))
resid_box
# NOTE(review): require() returns FALSE instead of erroring if gridExtra is
# missing; library(gridExtra) would fail loudly.
require(gridExtra)
# boxplots and distribution plots side by side
grid.arrange(resid_box,resid_dist, ncol=2)
Lift curves describe a performance coefficient (lift) over the cumulative proportion of a population. Lift is calculated as the proportion of "yes" outcomes (here, toxicity) at a given sample point divided by the proportion of "yes" outcomes in the whole dataset: \(Lift = \frac{\text{Predicted Rate}}{\text{Average Rate}}\).
http://rstudio-pubs-static.s3.amazonaws.com/436131_3212dcf341cc422590f1a9f52830cfd6.html
# Transform datasets and model objects into scored data and calculate
# percentiles (ntiles = 100) for the modelplotr gains/lift/response curves.
scores_and_ntiles <- prepare_scores_and_ntiles(
  datasets = list("train", "test"),
  dataset_labels = list("train data", "test data"),
  models = list("classif_rf", "classif_decTree",
                "classif_glm", "classif_nnet",
                "classif_svm", "classif_xgbTree"),
  # Fix: the original labels contained hard line breaks and runs of spaces
  # from source-line wrapping (e.g. "Decision\n   tree"), which rendered as
  # broken legend text.
  model_labels = list("random forest", "Decision tree",
                      "General linear model", "Neural net",
                      "Support Vector machine",
                      "eXtreme Gradient Boosting Trees"),
  target_column = "effect_10",
  ntiles = 100)
# Aggregate the scored data for the chosen plotting scope: compare all models.
plot_input <- plotting_scope(prepared_input = scores_and_ntiles, scope = 'compare_models')
# Fix: brewer.pal() requires n >= 3 (n = 2 warns and returns 3 colors anyway),
# and six models are compared here, so request six colors.
plot_cumgains(data = plot_input, custom_line_colors = RColorBrewer::brewer.pal(6, 'Accent'))
plot_cumlift(data = plot_input, custom_line_colors = RColorBrewer::brewer.pal(6, 'Accent'))
plot_cumresponse(data = plot_input, highlight_ntile = 20,
                 custom_line_colors = RColorBrewer::brewer.pal(6, 'Accent'))
plot_multiplot(data = plot_input, custom_line_colors = RColorBrewer::brewer.pal(6, 'Accent'))
# ROC curves for the three models kept in this comparison (the others are
# commented out), with AUC values reported in the subtitle.
plot(mp_classif_rf,
#mp_classif_glm,
mp_classif_svm,
#mp_classif_decTree,
mp_classif_nnet,
#mp_classif_xgbTree,
geom = "roc") +
# The nested paste() calls concatenate the three "AUC_x = value" strings
# (space-separated) into a single subtitle line.
ggtitle("ROC Curves - All Models",
paste("AUC_rf = ",round(mp_classif_rf$measures$auc,3),
paste("AUC_svm = ",round(mp_classif_svm$measures$auc,3)),
paste("AUC_nnet = ",round(mp_classif_nnet$measures$auc,3))
)) +
#, "AUC_glm = 0.799 AUC_svm = 0.798 AUC_decTree = AUC_nnet = AUC_xgbTree = ") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
# Permutation variable importance (DALEX::model_parts) for each model.
# NOTE(review): loss_root_mean_square is a regression loss applied here to
# classifier predictions; DALEX also provides classification losses such as
# loss_one_minus_auc -- confirm RMSE on probabilities is intended.
vi_classif_rf <- model_parts(explainer_classif_rf, loss_function = loss_root_mean_square)
vi_classif_glm <- model_parts(explainer_classif_glm, loss_function = loss_root_mean_square)
vi_classif_svm <- model_parts(explainer_classif_svm, loss_function = loss_root_mean_square)
vi_classif_decTree <- model_parts(explainer_classif_decTree, loss_function = loss_root_mean_square)
vi_classif_nnet <- model_parts(explainer_classif_nnet, loss_function = loss_root_mean_square)
vi_classif_xgbTree <- model_parts(explainer_classif_xgbTree, loss_function = loss_root_mean_square)
#vi_classif_logicBag <- model_parts(explainer_classif_logicBag, loss_function = loss_root_mean_square)
# importance plots, three models per figure
plot(vi_classif_rf, vi_classif_glm, vi_classif_svm)#, #vi_classif_logicBag)
plot(vi_classif_decTree, vi_classif_nnet, vi_classif_xgbTree)
# Partial-dependence profiles of the volumetric dose variable, one per model
# (DALEX::model_profile with type = "partial").
pdp_classif_rf <- model_profile(explainer_classif_rf, variable = "dose.um3.mL.master", type = "partial")
pdp_classif_glm <- model_profile(explainer_classif_glm, variable = "dose.um3.mL.master", type = "partial")
pdp_classif_svm <- model_profile(explainer_classif_svm, variable = "dose.um3.mL.master", type = "partial")
pdp_classif_decTree <- model_profile(explainer_classif_decTree, variable = "dose.um3.mL.master", type = "partial")
pdp_classif_nnet <- model_profile(explainer_classif_nnet, variable = "dose.um3.mL.master", type = "partial")
pdp_classif_xgbTree <- model_profile(explainer_classif_xgbTree, variable = "dose.um3.mL.master", type = "partial")
#pdp_classif_logicBag <- model_profile(explainer_classif_logicBag, variable = "dose.um3.mL.master", type = "partial")
# overlay all six dose profiles in one figure
plot(pdp_classif_rf, pdp_classif_glm, pdp_classif_svm, pdp_classif_decTree, pdp_classif_nnet, pdp_classif_xgbTree)#, pdp_classif_logicBag)
# Partial dependence on organism type (re-uses the pdp_classif_* names,
# overwriting the dose profiles above).
pdp_classif_rf <- model_profile(explainer_classif_rf, variable = "organism.group", type = "partial")
pdp_classif_glm <- model_profile(explainer_classif_glm, variable = "organism.group", type = "partial")
pdp_classif_svm <- model_profile(explainer_classif_svm, variable = "organism.group", type = "partial")
pdp_classif_decTree <- model_profile(explainer_classif_decTree, variable = "organism.group", type = "partial")
pdp_classif_nnet <- model_profile(explainer_classif_nnet, variable = "organism.group", type = "partial")
pdp_classif_xgbTree <- model_profile(explainer_classif_xgbTree, variable = "organism.group", type = "partial")
#pdp_classif_logicBag <- model_profile(explainer_classif_logicBag, variable = "organism.group", type = "partial")
# contrastive profiles: plot the aggregated profiles of all six models together
plot(pdp_classif_rf$agr_profiles, pdp_classif_glm$agr_profiles, pdp_classif_svm$agr_profiles, pdp_classif_decTree$agr_profiles, pdp_classif_nnet$agr_profiles, pdp_classif_xgbTree$agr_profiles) +
ggtitle("Contrastive Partial Dependence Profiles", "") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
# GLM and neural-net partial-dependence profiles, stacked in one figure.
glm <- plot(pdp_classif_glm$agr_profiles) +
  labs(title = "GLM: Partial Dependence Profiles by Dose and Effect Metric",
       subtitle = "",
       x = "log10(dose particles/mL)",
       y = "Average Prediction for Effect (1 = yes, 0 = no)") +
  scale_color_discrete(name = "Model x Effect Metric") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
nnet <- plot(pdp_classif_nnet$agr_profiles) +
  labs(title = "Neural Net: Partial Dependence Profiles by Dose and Effect Metric",
       subtitle = "",
       x = "log10(dose particles/mL)",
       y = "Average Prediction for Effect (1 = yes, 0 = no)") +
  scale_color_discrete(name = "Model x Effect Metric") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
# stack the two panels vertically
grid.arrange(glm, nnet)
# Random-forest and decision-tree partial-dependence profiles, stacked.
rf <- plot(pdp_classif_rf$agr_profiles) +
  labs(title = "Random Forest:Partial Dependence Profiles by Dose and Effect Metric",
       subtitle = "",
       x = "log10(dose particles/mL)",
       y = "Average Prediction for Effect (1 = yes, 0 = no)") +
  scale_color_discrete(name = "Model x Effect Metric") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
decTree <- plot(pdp_classif_decTree$agr_profiles) +
  labs(title = "Decision Tree: Partial Dependence Profiles by Dose and Effect Metric",
       subtitle = "",
       x = "log10(dose particles/mL)",
       y = "Average Prediction for Effect (1 = yes, 0 = no)") +
  scale_color_discrete(name = "Model x Effect Metric") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
# stack the two panels vertically
grid.arrange(rf, decTree)
# plot svm and xgbTree (fix: the original comment said "rf and decision tree",
# copied from the block above)
svm <- plot(pdp_classif_svm$agr_profiles) +
  # Fix: added the missing space after the colon in the title.
  ggtitle("Support Vector Machine: Partial Dependence Profiles by Dose and Effect Metric", "") +
  xlab("log10(dose particles/mL)") +
  ylab("Average Prediction for Effect (1 = yes, 0 = no)") +
  scale_color_discrete(name = "Model x Effect Metric") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
xgbTree <- plot(pdp_classif_xgbTree$agr_profiles) +
  # Fix: corrected the user-facing typo "Boosing" -> "Boosting".
  ggtitle("eXtreme Gradient Boosting Trees: Partial Dependence Profiles by Dose and Effect Metric", "") +
  xlab("log10(dose particles/mL)") +
  ylab("Average Prediction for Effect (1 = yes, 0 = no)") +
  scale_color_discrete(name = "Model x Effect Metric") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
# stack the two panels vertically
grid.arrange(svm, xgbTree)
Confusion Matrices for the six tested models are below.
# fill alpha relative to sensitivity/specificity by proportional outcomes
# within reference groups (see dplyr code above as well as original confusion
# matrix for comparison)
#
# Helper: draw one confusion-matrix tile plot. The six blocks below were
# byte-identical except for the input table, title, and accuracy value, so
# they are factored into a single function.
#   plot_table  - data frame with Reference, Prediction, Freq, prop columns
#   model_title - plot title (model name)
#   accuracy    - accuracy in [0, 1]; shown as a percentage in the subtitle
plot_confusion_matrix <- function(plot_table, model_title, accuracy) {
  ggplot(data = plot_table, mapping = aes(x = Reference, y = Prediction, fill = prop)) +
    geom_tile() +
    geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1) +
    scale_fill_gradient(low = "white", high = "cyan4", name = "Proportion") +
    scale_x_discrete(labels = c("No Effect", "Effect")) +
    scale_y_discrete(labels = c("No Effect", "Effect")) +
    ggtitle(model_title,
            paste("Accuracy = ", 100 * round(accuracy, 3), "%")) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5),
          legend.position = "none")
}
CM_rf <- plot_confusion_matrix(plotTable_rf, "Random Forest",
                               mp_classif_rf$measures$accuracy)
CM_glm <- plot_confusion_matrix(plotTable_glm, "General Linear Model",
                                mp_classif_glm$measures$accuracy)
CM_svm <- plot_confusion_matrix(plotTable_svm, "Support Vector Machine",
                                mp_classif_svm$measures$accuracy)
CM_decTree <- plot_confusion_matrix(plotTable_decTree, "Decision Tree",
                                    mp_classif_decTree$measures$accuracy)
CM_nnet <- plot_confusion_matrix(plotTable_nnet, "Neural Net",
                                 mp_classif_nnet$measures$accuracy)
CM_xgbTree <- plot_confusion_matrix(plotTable_xgbTree, "eXtreme Gradient Boosting Trees",
                                    mp_classif_xgbTree$measures$accuracy)
# all six confusion matrices in a 2-column grid
grid.arrange(CM_rf, CM_glm, CM_svm, CM_decTree, CM_nnet, CM_xgbTree,
             ncol = 2, top = "Confusion Matrices for ML Models")
# create a single observation
# NOTE(review): new_cust is never used below, and predict_parts() receives
# the whole `test` data frame even though break_down explains a single
# observation (DALEX uses the first row in that case) -- confirm whether a
# specific row was intended, e.g. test[1, ].
new_cust <- test$effect_10 %>% as.data.frame()
# compute breakdown distances (variable attributions) for each model
new_cust_glm <- predict_parts(explainer_classif_glm, new_observation = test, type = "break_down")
new_cust_rf <- predict_parts(explainer_classif_rf, new_observation = test, type = "break_down")
new_cust_svm <- predict_parts(explainer_classif_svm, new_observation = test, type = "break_down")
new_cust_decTree <- predict_parts(explainer_classif_decTree, new_observation = test, type = "break_down")
new_cust_nnet <- predict_parts(explainer_classif_nnet, new_observation = test, type = "break_down")
new_cust_xgbTree <- predict_parts(explainer_classif_xgbTree, new_observation = test, type = "break_down")
#new_cust_logicBag <- predict_parts(explainer_classif_logicBag, new_observation = test, type = "break_down")
# class of prediction_breakdown output
class(new_cust_rf)
## [1] "predict_parts" "break_down" "data.frame"
# check out the top 10 influential variables for this observation
new_cust_rf[1:10, 1:5]
## contribution
## : intercept 0.283
## : effect.metric = 6 -0.055
## : dose.um3.mL.master = 4.02413367973394 0.045
## : dose.mg.L.master = -1.95467702121334 -0.061
## : lvl1_f = 5 -0.018
## : treatments = 0.301029995663981 -0.005
## : shape = 4 0.023
## : dose.particles.mL.master = 3.40204507040936 0.017
## : size.length.um.used.for.conversions = 0.301029995663981 -0.014
## : bio.org = 2 -0.087
# break-down waterfall plots for all six models
plot(new_cust_glm, new_cust_rf, new_cust_svm, new_cust_decTree, new_cust_nnet, new_cust_xgbTree)#,
#new_cust_logicBag)
library(ggplot2)
# Helper: combine two (or more) break-down results and dot-plot their largest
# absolute contributions, one facet per model label. Factored out of three
# byte-identical chunks.
#   breakdowns - list of predict_parts() break_down data frames
#   n_top      - number of rows kept per model by |contribution|
# NOTE(review): n_top = 11 matches the original top_n(., 11) even though the
# surrounding comments said "top 10" -- presumably 11 = 10 variables plus the
# intercept row; confirm.
plot_top_contributions <- function(breakdowns, n_top = 11) {
  breakdowns %>%
    purrr::map(~ top_n(., n_top, wt = abs(contribution))) %>%
    do.call(rbind, .) %>%
    mutate(variable = paste0(variable, " (", label, ")")) %>%
    ggplot(aes(contribution, reorder(variable, contribution))) +
    geom_point() +
    # thick white reference line at zero contribution
    geom_vline(xintercept = 0, size = 3, color = "white") +
    facet_wrap(~ label, scales = "free_y", ncol = 1) +
    ylab(NULL)
}
# top influential variables for each pair of models
plot_top_contributions(list(new_cust_glm, new_cust_rf))
plot_top_contributions(list(new_cust_svm, new_cust_decTree))
plot_top_contributions(list(new_cust_nnet, new_cust_xgbTree))
All in all, random forest is my final model of choice: it appears the most balanced and is the most accurate overall. This model will now be tuned and refined for maximum performance.
http://rstudio-pubs-static.s3.amazonaws.com/480890_237ad52b09b6440e9c849a3c07a04d2f.html
Plot tuning.
cache = TRUE
# Plot every performance metric in model$results against the mtry tuning grid,
# one facet per metric (expects a caret train object with those columns).
# NOTE: theme_set() is a global side effect -- it changes the default ggplot
# theme for the whole session.
my_plot <- function(model) {
  theme_set(theme_minimal())
  # Fix: qualify as dplyr::select() -- bare select() is masked in this
  # session, which is why the rest of the file consistently writes
  # dplyr::select().
  u <- model$results %>%
    dplyr::select(mtry, Accuracy, Kappa, F1, Sensitivity,
                  Specificity, Pos_Pred_Value, Neg_Pred_Value,
                  Precision, Recall, Detection_Rate) %>%
    gather(a, b, -mtry) # long format: a = metric name, b = metric value
  u %>% ggplot(aes(mtry, b)) + geom_line() + geom_point() +
    facet_wrap(~ a, scales = "free") +
    labs(x = "Number of mtry", y = NULL,
         title = "The Relationship between Model Performance and mtry")
}
rf1 %>% my_plot()
I will further refine this model using recursive feature elimination and compare accuracy with the other models.
# Recursive feature elimination with random-forest ranking functions:
# 10-fold CV repeated 5 times, keeping all resampling results.
my_ctrl <- rfeControl(functions = rfFuncs, # random forests
                      method = "repeatedcv",
                      verbose = FALSE,
                      repeats = 5,
                      returnResamp = "all")
# Fix: caret::rfe's argument is `sizes`; the original `size =` only worked
# through R's partial argument matching.
rfProfile <- rfe(y = df$effect_10,            # response
                 x = df %>%
                   dplyr::select(-effect_10), # all remaining columns as predictors
                 rfeControl = my_ctrl,
                 sizes = c(1:2, 4, 6, 8, 10, 12, 13))
rfProfile
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.8218 0.5080 0.01679 0.05450
## 2 0.8207 0.5093 0.01771 0.05582
## 4 0.9041 0.7710 0.01758 0.04347
## 6 0.9102 0.7860 0.01643 0.04098
## 8 0.9133 0.7941 0.01638 0.04017
## 10 0.9127 0.7927 0.01762 0.04338
## 12 0.9195 0.8089 0.01590 0.03891
## 13 0.9187 0.8071 0.01613 0.03958
## 18 0.9196 0.8097 0.01579 0.03830 *
##
## The top 5 variables (out of 18):
## effect.metric, lvl2_f, dose.mg.L.master, dose.particles.mL.master, dose.um3.mL.master
The following variables are those that were picked in the final (most accurate) model.
# get the variable names selected in the final (most accurate) rfe model
predictors(rfProfile)
## [1] "effect.metric" "lvl2_f"
## [3] "dose.mg.L.master" "dose.particles.mL.master"
## [5] "dose.um3.mL.master" "organism.group"
## [7] "size.length.um.used.for.conversions" "particle.volume.um3"
## [9] "exposure.duration.d" "treatments"
## [11] "density.mg.um.3" "bio.org"
## [13] "shape" "environment"
## [15] "life.stage" "lvl1_f"
## [17] "polymer" "exposure.route"
The first six resampling results are shown below with their corresponding accuracy and kappa values.
# First six resampling rows: subset size, accuracy/kappa, and the four
# confusion-matrix cell counts for each CV fold/repeat.
head(rfProfile$resample)
## Variables Accuracy Kappa .cell1 .cell2 .cell3 .cell4 Resample
## 1 1 0.8165138 0.4906542 150 0 40 28 Fold01.Rep1
## 2 2 0.8119266 0.4853161 148 2 39 29 Fold01.Rep1
## 3 4 0.9311927 0.8364182 145 5 10 58 Fold01.Rep1
## 4 6 0.9174312 0.8045038 143 7 11 57 Fold01.Rep1
## 5 8 0.9357798 0.8479474 145 5 9 59 Fold01.Rep1
## 6 10 0.9357798 0.8479474 145 5 9 59 Fold01.Rep1
# use caret's lattice theme for the trellis plots below
trellis.par.set(caretTheme())
#plot(rfProfile, type = c("g", "o"), metric = "Accuracy")
# ggplot method: accuracy versus subset size
ggplot(rfProfile) + theme_bw()
# lattice: individual resampling results with a smoother overlaid
plot1 <- xyplot(rfProfile, type = c("g", "p", "smooth"), main = "Accuracy of individual resampling results")
# accuracy densities for the small subset sizes (< 5 variables)
plot2 <- densityplot(rfProfile,
subset = Variables < 5,
adjust = 1.25,
as.table = TRUE,
xlab = "Accuracy Estimates",
pch = "|")
# stack the two lattice plots vertically (split = column, row, ncol, nrow)
print(plot1, split=c(1,1,1,2), more=TRUE)
print(plot2, split=c(1,2,1,2))
# smallest subset size whose accuracy is within tol% (here 1%) of the best
my_size <- pickSizeTolerance(rfProfile$results, metric = "Accuracy", tol = 1, maximize = TRUE)
# higher tol (~10) gives you fewer variables
# lower tol (~1) gives you more variables - "I'd like the simplest model within 1% of the best model."
accuracy1 <- pickVars(rfProfile$variables, size = my_size)
accuracy1
## [1] "effect.metric" "lvl2_f"
## [3] "dose.mg.L.master" "dose.particles.mL.master"
## [5] "dose.um3.mL.master" "organism.group"
## [7] "size.length.um.used.for.conversions" "particle.volume.um3"
A random forest model with the above eight predictors is within 1% of the best model. If a more accurate model is desired, and data is available, additional factors can be included in the model. Shown below are factors that should be included for a model that is within 0.5% accuracy of the best model.
# same selection at a tighter 0.5% tolerance -> more variables retained
my_size <- pickSizeTolerance(rfProfile$results, metric = "Accuracy", tol = 0.5, maximize = TRUE)
# higher tol (~10) gives you fewer variables
# lower tol (~1) gives you more variables - "I'd like the simplest model within 1% of the best model."
accuracy0.5 <- pickVars(rfProfile$variables, size = my_size)
accuracy0.5
## [1] "effect.metric" "lvl2_f"
## [3] "dose.mg.L.master" "dose.particles.mL.master"
## [5] "dose.um3.mL.master" "organism.group"
## [7] "size.length.um.used.for.conversions" "particle.volume.um3"
## [9] "exposure.duration.d" "treatments"
## [11] "density.mg.um.3" "bio.org"
Below is a table showing which variables are included in models of varying accuracies.
# Helper: variables retained at a given accuracy tolerance. pickSizeTolerance()
# finds the smallest subset size within `tol` percent of the best accuracy;
# pickVars() returns the variable names for that size. Replaces four repeated
# my_size/pickVars pairs (the scratch `my_size` variable is no longer created).
pick_vars_within <- function(profile, tol) {
  best_size <- pickSizeTolerance(profile$results, metric = "Accuracy",
                                 tol = tol, maximize = TRUE)
  pickVars(profile$variables, size = best_size)
}
accuracy0.1 <- pick_vars_within(rfProfile, 0.1)
accuracy0.3 <- pick_vars_within(rfProfile, 0.3)
accuracy5 <- pick_vars_within(rfProfile, 5)
accuracy10 <- pick_vars_within(rfProfile, 10)
# NOTE(review): varsTable appears unused (the kable below builds its own list)
# and omits the 0.3% tolerance -- kept unchanged for backward compatibility.
varsTable = list('0.1%' =accuracy0.1, '1%' = accuracy1, '5%' = accuracy5, '10%' = accuracy10)
# make padded data frame helpers
# Pad (or truncate) a vector to exactly `len` elements; indexing past the end
# yields NA. Fix: seq_len(len) instead of 1:len, so len = 0 returns an empty
# vector rather than x[c(1, 0)] (which silently kept the first element).
na.pad <- function(x, len) {
  x[seq_len(len)]
}
# Column-bind a list of unequal-length vectors into a data frame, padding the
# shorter vectors with NA up to the length of the longest. `...` is forwarded
# to data.frame(). vapply() replaces sapply() for a type-stable length lookup.
makePaddedDataFrame <- function(l, ...) {
  maxlen <- max(vapply(l, length, integer(1)))
  data.frame(lapply(l, na.pad, len = maxlen), ...)
}
# fancy print: one column per accuracy tolerance, padded with NA to equal length
# NOTE(review): knitr::kable() has no `footnote` argument -- it is swallowed
# by `...`; kableExtra::footnote() applied to the kable object is the usual
# way to add one. Confirm intended.
kable(makePaddedDataFrame(
list('Accuracy (0.1)' = accuracy0.1, 'Accuracy (0.3)' = accuracy0.3, "Accuracy (1)" = accuracy1, 'Accuracy (5)' = accuracy5, 'Accuracy (10)' = accuracy10)
),
caption = "Variables to include in Random Forest to achieve Model Accuracy (within x% of best model)",
footnote = "Random Forest Recursive Feature Elimination")
| Accuracy..0.1. | Accuracy..0.3. | Accuracy..1. | Accuracy..5. | Accuracy..10. |
|---|---|---|---|---|
| effect.metric | effect.metric | effect.metric | effect.metric | effect.metric |
| lvl2_f | lvl2_f | lvl2_f | lvl2_f | lvl2_f |
| dose.mg.L.master | dose.mg.L.master | dose.mg.L.master | dose.mg.L.master | dose.mg.L.master |
| dose.particles.mL.master | dose.particles.mL.master | dose.particles.mL.master | dose.particles.mL.master | dose.particles.mL.master |
| dose.um3.mL.master | dose.um3.mL.master | dose.um3.mL.master | NA | NA |
| organism.group | organism.group | organism.group | NA | NA |
| size.length.um.used.for.conversions | size.length.um.used.for.conversions | size.length.um.used.for.conversions | NA | NA |
| particle.volume.um3 | particle.volume.um3 | particle.volume.um3 | NA | NA |
| exposure.duration.d | exposure.duration.d | NA | NA | NA |
| treatments | treatments | NA | NA | NA |
| density.mg.um.3 | density.mg.um.3 | NA | NA | NA |
| bio.org | bio.org | NA | NA | NA |
While the absolute best model contains 18 variables with an estimated accuracy of ~92.0% (0.9196 in the resampling summary above), it is possible to achieve ~91.3% accuracy with just eight variables, as displayed in the above table. The advantage of including fewer variables is that less information is needed to accurately predict toxicity. The final model with these eight variables will be further refined through an iterative tuning process to determine the optimal number of variables to be randomly collected for sampling at each split (mtry), and the number of branches to grow after each split. Code for the following section is inspired from https://rpubs.com/phamdinhkhanh/389752.
# Final modeling dataset: the response (effect_10) plus the eight predictors
# retained by the recursive feature elimination above. droplevels() removes
# factor levels that no longer occur.
final_df <- df %>%
  dplyr::select(effect.metric, dose.mg.L.master, lvl2_f,
                dose.particles.mL.master, effect_10, dose.um3.mL.master,
                organism.group, exposure.duration.d, treatments) %>%
  droplevels()
# ensure completeness
skim(final_df)
| Name | final_df |
| Number of rows | 2188 |
| Number of columns | 9 |
| _______________________ | |
| Column type frequency: | |
| factor | 4 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| effect.metric | 0 | 1 | FALSE | 3 | not: 1576, NOE: 318, LOE: 294 |
| lvl2_f | 0 | 1 | FALSE | 25 | Mor: 412, Rep: 368, Oxi: 288, Dev: 287 |
| effect_10 | 0 | 1 | FALSE | 2 | 0: 1504, 1: 684 |
| organism.group | 0 | 1 | FALSE | 10 | Cru: 795, Fis: 528, Mol: 433, Alg: 222 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| dose.mg.L.master | 0 | 1 | -0.23 | 2.20 | -11.64 | -1.70 | -0.30 | 1.40 | 8.17 | ▁▁▇▆▁ |
| dose.particles.mL.master | 0 | 1 | 4.24 | 4.05 | -4.20 | 1.05 | 3.40 | 7.20 | 21.36 | ▃▇▃▁▁ |
| dose.um3.mL.master | 0 | 1 | 5.84 | 2.34 | -5.67 | 4.42 | 6.02 | 7.37 | 14.32 | ▁▁▇▆▁ |
| exposure.duration.d | 0 | 1 | 0.93 | 0.73 | -1.40 | 0.48 | 1.00 | 1.32 | 1.98 | ▁▁▆▆▇ |
| treatments | 0 | 1 | 0.58 | 0.15 | 0.00 | 0.48 | 0.60 | 0.60 | 1.00 | ▁▁▆▇▁ |
# ---- Train/test partition -------------------------------------------------
# Reproducible 75/25 split of the reduced data set via rsample's
# initial_split(); training()/testing() then extract each partition.
set.seed(4)
df_final_split <- initial_split(final_df, prop = 0.75)
# prop = 0.75 makes the default split explicit: 75% training, 25% testing.

train_final <- training(df_final_split)
test_final <- testing(df_final_split)

# variable names for response & features
y <- "effect_10"
x <- setdiff(names(final_df), y)
The mtry parameter will be optimized below:
# Candidate mtry values (predictors sampled at each split): 1 through 7.
# NOTE: caret's "rf" method only exposes mtry for tuning; ntree is
# optimized manually in a later chunk instead.
tunegrid <- expand.grid(.mtry = 1:7)

# 10-fold cross-validation, repeated 3 times.
fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 3)

nrow(tunegrid)
## [1] 7
set.seed(825)
# Grid-search mtry over 1..7 (tunegrid) with 10-fold CV repeated 3 times
# (fitControl), selecting the model with the highest resampled Accuracy.
tuneFit <- train(effect_10 ~ ., data = train_final,
                 method = "rf",
                 trControl = fitControl,
                 verbose = FALSE,
                 ## Now specify the exact models
                 ## to evaluate:
                 metric = 'Accuracy',
                 tuneGrid = tunegrid)
tuneFit
## Random Forest
##
## 1641 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 1477, 1478, 1477, 1476, 1477, 1476, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.6867798 0.0000000
## 2 0.7576789 0.3134210
## 3 0.8135455 0.5174236
## 4 0.8696117 0.6813289
## 5 0.8864447 0.7264346
## 6 0.8957809 0.7500463
## 7 0.9000418 0.7610493
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.
plot(tuneFit)
Now that mtry is optimized, the number of trees will be optimized, holding mtry constant.
# Manual search over ntree: hold mtry at its optimum (7) and fit one
# cross-validated model per candidate forest size.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                        search = "grid")
tunegrid <- expand.grid(.mtry = c(7)) # replace with optimal value from previous chunk

modellist <- list()
for (ntree in c(1000, 1500, 2000, 2500)) {
  # Re-seed each iteration so every ntree candidate sees identical CV folds.
  set.seed(123)
  modellist[[as.character(ntree)]] <- train(effect_10 ~ .,
                                            data = train_final,
                                            method = "rf",
                                            metric = 'Accuracy',
                                            tuneGrid = tunegrid,
                                            trControl = control,
                                            ntree = ntree)
}

# compare resampling distributions across the ntree candidates
results <- resamples(modellist)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: 1000, 1500, 2000, 2500
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1000 0.8780488 0.8902439 0.9024390 0.9029182 0.9151515 0.9329268 0
## 1500 0.8780488 0.8902439 0.9024390 0.9031177 0.9148912 0.9272727 0
## 2000 0.8780488 0.8916093 0.9027347 0.9039283 0.9148912 0.9329268 0
## 2500 0.8780488 0.8916093 0.9027347 0.9039283 0.9148912 0.9329268 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1000 0.7059351 0.7354245 0.7625810 0.7678826 0.8003116 0.8355215 0
## 1500 0.7059351 0.7354245 0.7648221 0.7682879 0.7968108 0.8315180 0
## 2000 0.7059351 0.7389113 0.7678385 0.7702313 0.7968108 0.8355215 0
## 2500 0.7059351 0.7389113 0.7678385 0.7702313 0.7968108 0.8355215 0
dotplot(results)
Now that we have optimized the hyperparameters (mtry and number of trees) for our final model, we will build and save the final simplified model that has the highest accuracy.
# Fit the final simplified model with the tuned hyperparameters:
# mtry fixed at 7 (from the repeated-CV grid search) and ntree = 2500.
tunegrid <- expand.grid(.mtry = c(7))
# BUG FIX: tunegrid was previously created but never passed to train();
# the call used tuneLength = 5 instead, so caret re-tuned mtry rather than
# using the optimum found above. Pass tuneGrid explicitly.
final_model <- train(effect_10 ~ ., data = train_final, method = "rf",
                     ntree = 2500, tuneGrid = tunegrid)
# explain: wrap the caret model for DALEX model-agnostic diagnostics,
# with the held-out test set as reference data and the observed class
# (as numeric 0/1) as the target.
yTest <- as.numeric(as.character(test_final$effect_10))
explainer_final_model <- DALEX::explain(final_model, label = "rf",
                                        data = test_final,
                                        y = yTest)
## Preparation of a new explainer is initiated
## -> model label : rf
## -> data : 547 rows 9 cols
## -> data : tibble converted into a data.frame
## -> target variable : 547 values
## -> predict function : yhat.train will be used (default)
## -> predicted values : numerical, min = 0 , mean = 0.2799678 , max = 0.9976
## -> model_info : package caret , ver. 6.0.86 , task classification (default)
## -> residual function : difference between y and yhat (default)
## -> residuals : numerical, min = -0.99 , mean = 0.03081828 , max = 0.9896
## A new explainer has been created!
# Aggregate test-set performance (recall, precision, F1, accuracy, AUC)
# and the residual distribution, computed by DALEX from the explainer.
mp_classif_final_model <- model_performance(explainer_final_model)
mp_classif_final_model
## Measures for: classification
## recall : 0.7411765
## precision : 0.919708
## f1 : 0.8208469
## accuracy : 0.8994516
## auc : 0.9550476
##
## Residuals:
## 0% 10% 20% 30% 40% 50% 60% 70%
## -0.99000 -0.14400 -0.07208 -0.03008 -0.01200 -0.00520 -0.00160 0.00360
## 80% 90% 100%
## 0.06256 0.34920 0.98960
Performance metrics for the final model are below.
# ROC curve for the final model, annotated with AUC and accuracy
# pulled from the DALEX performance object.
roc_subtitle <- paste("AUC = ",
                      round(mp_classif_final_model$measures$auc, 2),
                      paste("Accuracy = ",
                            100 * round(mp_classif_final_model$measures$accuracy, 3),
                            "%"))
plot(mp_classif_final_model, geom = "roc") +
  ggtitle("ROC Curve - Final Model", roc_subtitle) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        legend.position = "none")
The tuned, final, simplified model is now validated using test set data.
# Validate the final model on the held-out 25% test set.
test_pred <- predict(final_model, newdata = test_final)
# Confusion matrix from the predicted-vs-observed cross-tabulation;
# the 'positive' class here is "0" (no effect), per the printed output.
confusionMatrix(table(test_pred, test_final$effect_10))
## Confusion Matrix and Statistics
##
##
## test_pred 0 1
## 0 366 44
## 1 11 126
##
## Accuracy : 0.8995
## 95% CI : (0.8711, 0.9234)
## No Information Rate : 0.6892
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7521
##
## Mcnemar's Test P-Value : 1.597e-05
##
## Sensitivity : 0.9708
## Specificity : 0.7412
## Pos Pred Value : 0.8927
## Neg Pred Value : 0.9197
## Prevalence : 0.6892
## Detection Rate : 0.6691
## Detection Prevalence : 0.7495
## Balanced Accuracy : 0.8560
##
## 'Positive' Class : 0
##
This confusion matrix is plotted below.
# Tidy the confusion matrix into a data frame for plotting.
# Renamed from `table` to avoid masking base::table(), which is still
# used elsewhere in this script.
cm_df <- data.frame(confusionMatrix(test_pred, test_final$effect_10)$table)
plotTable <- cm_df %>%
  # Flag correct vs. incorrect cells; bare column names are safer inside
  # mutate() than re-indexing the source data frame.
  mutate(goodbad = ifelse(Prediction == Reference, "good", "bad")) %>%
  group_by(Reference) %>%
  mutate(prop = Freq / sum(Freq)) %>%
  ungroup() # don't leave a grouped tibble lying around
# fill alpha relative to sensitivity/specificity by proportional outcomes within reference groups (see dplyr code above as well as original confusion matrix for comparison)
ggplot(data = plotTable, mapping = aes(x = Reference, y = Prediction, fill = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1) +
  scale_fill_gradient(low = "white", high = "cyan4", name = "Proportion") +
  scale_x_discrete(labels = c("No Effect", "Effect")) +
  scale_y_discrete(labels = c("No Effect", "Effect")) +
  theme_bw()
Time to knit:
all_times
## $`unnamed-chunk-1`
## Time difference of 7.34167 secs
##
## $`unnamed-chunk-2`
## Time difference of 1.478028 secs
##
## $`unnamed-chunk-3`
## Time difference of 0.3949451 secs
##
## $`unnamed-chunk-4`
## Time difference of 0.02393603 secs
##
## $`All Model Training`
## Time difference of 11.44312 mins
##
## $`model performance`
## Time difference of 0.0189178 secs
##
## $`unnamed-chunk-5`
## Time difference of 0.6174459 secs
##
## $`unnamed-chunk-6`
## Time difference of 0.879647 secs
##
## $`unnamed-chunk-7`
## Time difference of 0.71808 secs
##
## $`unnamed-chunk-8`
## Time difference of 0.636302 secs
##
## $`unnamed-chunk-9`
## Time difference of 1.198795 secs
##
## $`unnamed-chunk-10`
## Time difference of 0.955446 secs
##
## $`unnamed-chunk-11`
## Time difference of 1.619668 secs
##
## $`unnamed-chunk-12`
## Time difference of 0 secs
##
## $`unnamed-chunk-13`
## Time difference of 5.024565 secs
##
## $`unnamed-chunk-14`
## Time difference of 0.8028538 secs
##
## $`unnamed-chunk-15`
## Time difference of 2.41654 secs
##
## $`unnamed-chunk-16`
## Time difference of 2.858357 secs
##
## $`unnamed-chunk-17`
## Time difference of 0.595407 secs
##
## $`unnamed-chunk-18`
## Time difference of 1.390181 mins
##
## $`unnamed-chunk-19`
## Time difference of 1.522929 secs
##
## $`unnamed-chunk-20`
## Time difference of 6.398894 secs
##
## $`unnamed-chunk-21`
## Time difference of 2.152245 secs
##
## $`unnamed-chunk-22`
## Time difference of 6.273228 secs
##
## $`unnamed-chunk-23`
## Time difference of 1.048199 secs
##
## $`unnamed-chunk-24`
## Time difference of 1.117015 secs
##
## $`unnamed-chunk-25`
## Time difference of 0.9175479 secs
##
## $`unnamed-chunk-26`
## Time difference of 0.7290499 secs
##
## $`unnamed-chunk-27`
## Time difference of 1.683497 secs
##
## $`unnamed-chunk-28`
## Time difference of 19.08298 secs
##
## $`unnamed-chunk-29`
## Time difference of 0.5614989 secs
##
## $`unnamed-chunk-30`
## Time difference of 0.6293161 secs
##
## $`unnamed-chunk-31`
## Time difference of 0.5734668 secs
##
## $`unnamed-chunk-32`
## Time difference of 0.4956751 secs
##
## $Tuning
## Time difference of 37.17174 mins
##
## $`unnamed-chunk-33`
## Time difference of 0.919852 secs
##
## $`Recursive Feature Elimination`
## Time difference of 8.896982 mins
##
## $`unnamed-chunk-34`
## Time difference of 0 secs
##
## $`unnamed-chunk-35`
## Time difference of 0.01096988 secs
##
## $`unnamed-chunk-36`
## Time difference of 0.2988698 secs
##
## $`unnamed-chunk-37`
## Time difference of 0.5814481 secs
##
## $`unnamed-chunk-38`
## Time difference of 0.02293897 secs
##
## $`unnamed-chunk-39`
## Time difference of 0.05186296 secs
##
## $`unnamed-chunk-40`
## Time difference of 0.05784392 secs
##
## $`unnamed-chunk-41`
## Time difference of 0.1785231 secs
##
## $`unnamed-chunk-42`
## Time difference of 0.02094388 secs
##
## $`unnamed-chunk-43`
## Time difference of 4.532175 mins
##
## $`unnamed-chunk-44`
## Time difference of 0.267421 secs
##
## $`unnamed-chunk-45`
## Time difference of 11.49029 mins
##
## $`unnamed-chunk-46`
## Time difference of 0.222404 secs
##
## $`Final Model`
## Time difference of 17.93435 mins
##
## $`unnamed-chunk-47`
## Time difference of 0.2544019 secs
##
## $`unnamed-chunk-48`
## Time difference of 0.1605709 secs
##
## $`unnamed-chunk-49`
## Time difference of 0.697135 secs
Machine Info:
Sys.info()[c(1:3,5)]
## sysname release version machine
## "Windows" "10 x64" "build 18363" "x86-64"
Session Info:
sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] parallel grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] gridExtra_2.3 modelplotr_1.1.0 doParallel_1.0.16 iterators_1.0.13
## [5] foreach_1.5.1 knitr_1.31 ggeffects_1.0.1 skimr_2.1.2
## [9] caret_6.0-86 tigerstats_0.3.2 abd_0.2-8 mosaic_1.8.3
## [13] ggridges_0.5.3 mosaicData_0.20.2 ggformula_0.10.1 ggstance_0.3.5
## [17] Matrix_1.3-2 lattice_0.20-41 nlme_3.1-152 forcats_0.5.1
## [21] stringr_1.4.0 purrr_0.3.4 readr_1.4.0 tidyr_1.1.2
## [25] tibble_3.0.6 ggplot2_3.3.3 tidyverse_1.3.0 DALEX_2.0.1
## [29] dplyr_1.0.4 rsample_0.0.9
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.2.1 plyr_1.8.6
## [4] repr_1.1.3 splines_4.0.4 crosstalk_1.1.1
## [7] listenv_0.8.0 leaflet_2.0.4.1 digest_0.6.27
## [10] htmltools_0.5.1.1 magrittr_2.0.1 mosaicCore_0.9.0
## [13] ggfittext_0.9.1 recipes_0.1.15 globals_0.14.0
## [16] modelr_0.1.8 gower_0.2.2 colorspace_2.0-0
## [19] rvest_0.3.6 ggrepel_0.9.1 haven_2.3.1
## [22] xfun_0.21 crayon_1.4.1 jsonlite_1.7.2
## [25] libcoin_1.0-8 survival_3.2-7 glue_1.4.2
## [28] polyclip_1.10-0 gtable_0.3.0 ipred_0.9-9
## [31] kernlab_0.9-29 scales_1.1.1 mvtnorm_1.1-1
## [34] DBI_1.1.1 Rcpp_1.0.6 Cubist_0.2.3
## [37] Formula_1.2-4 stats4_4.0.4 lava_1.6.8.1
## [40] prodlim_2019.11.13 htmlwidgets_1.5.3 httr_1.4.2
## [43] RColorBrewer_1.1-2 ellipsis_0.3.1 pkgconfig_2.0.3
## [46] manipulate_1.0.1 farver_2.0.3 nnet_7.3-15
## [49] sass_0.3.1 dbplyr_2.1.0 tidyselect_1.1.0
## [52] labeling_0.4.2 rlang_0.4.10 reshape2_1.4.4
## [55] munsell_0.5.0 cellranger_1.1.0 tools_4.0.4
## [58] xgboost_1.3.2.1 cli_2.3.0 generics_0.1.0
## [61] sjlabelled_1.1.7 broom_0.7.4 evaluate_0.14
## [64] ggdendro_0.1.22 yaml_2.2.1 ModelMetrics_1.2.2.2
## [67] fs_1.5.0 randomForest_4.6-14 future_1.21.0
## [70] xml2_1.3.2 compiler_4.0.4 rstudioapi_0.13
## [73] e1071_1.7-4 reprex_1.0.0 tweenr_1.0.1
## [76] bslib_0.2.4 stringi_1.5.3 iBreakDown_1.3.1
## [79] highr_0.8 vctrs_0.3.6 pillar_1.4.7
## [82] lifecycle_1.0.0 furrr_0.2.2 jquerylib_0.1.3
## [85] data.table_1.13.6 insight_0.13.0 R6_2.5.0
## [88] C50_0.1.3.1 parallelly_1.23.0 codetools_0.2-18
## [91] MASS_7.3-53 assertthat_0.2.1 withr_2.4.1
## [94] hms_1.0.0 rpart_4.1-15 labelled_2.7.0
## [97] timeDate_3043.102 class_7.3-18 rmarkdown_2.7
## [100] inum_1.0-3 ingredients_2.0.1 partykit_1.2-12
## [103] ggforce_0.3.2 pROC_1.17.0.1 lubridate_1.7.9.2
## [106] base64enc_0.1-3
A work by Scott Coffin
Scott.Coffin@waterboards.ca.gov